This document will explore the implementation of various types of plots that can be produced with base R graphics and/or the ggplot2 package. For each plot type we begin by showing basic plots and then enhance the plots by combining plot types, adding new variables using faceting, color, shape or size, or customizing colors, axes, axis labels, annotations or styles.
library(tidyverse)
library(vioplot)
library(ggmosaic)
library(scales) # provides alpha() function to set alpha for base R plots
library(hexbin)
library(RColorBrewer)
library(ggplot2)
library(dplyr)
library(viridis)
library(geojsonio)
library(kableExtra) # For nice looking tables in html output
Use the Iris dataset. Show distribution of petal width.
hist(iris$Petal.Width,
main = "Iris Petal Width Histogram",
xlab = "Petal Width")
hist(iris$Petal.Width,
main = "Iris Petal Width Histogram", xlab = "Petal Width", col='pink', breaks=20, xlim=c(0, 2.9), ylim=c(0, 40))
#Subsetting the different species
set <- iris$Sepal.Width[iris$Species == "setosa"]
ver <- iris$Sepal.Width[iris$Species == "versicolor"]
vir <- iris$Sepal.Width[iris$Species == "virginica"]
par(mfrow = c(3, 1))
hist(set, main = "Iris Setosa Petal Width", xlab = " Petal Width (Cm)", ylab="Frequency", col='orange', xlim=c(1.5, 6.5))
hist(ver, main = "Iris Versicolor Petal Width", xlab = " Petal Width (Cm)", ylab="Frequency", col='orange', xlim=c(1.5, 6.5))
hist(vir, main = "Iris Virginica Petal Width", xlab = " Petal Width (Cm)", ylab="Frequency", col='orange', xlim=c(1.5, 5.5))
ggplot(iris, aes(x=Petal.Width)) +
geom_histogram()
Changing the bins, color, outline color, adding title, and adding x and y labels
ggplot(iris, aes(x=Petal.Width)) +
geom_histogram(color="Blue", fill="orange", bins = 20) +
ggtitle("Iris Petal Width Histogram") +
xlab("Petal Width") +
ylab("Frequency")
ggplot(iris, aes(x=Petal.Width)) +
geom_histogram(color="yellow", aes(fill=Species), bins = 20) +
ggtitle("Iris Petal Width Histogram") +
xlab("Petal Width") +
ylab("Frequency")
ggplot(iris, aes(x=Petal.Width)) +
geom_histogram(color="blue", fill="orange", bins = 20) +
ggtitle("Iris Petal Width Histogram") +
xlab("Petal Width") + ylab("Frequency") +
facet_grid(. ~ Species)
#this is for more color
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
r <- rf(32)
#this is the actual setup of the plot
MM <- read.csv('data/MarchMadness.csv')
x <- MM$Pointdiff
y <- MM$Seed
dataframe <- data.frame(x, y)
h <- hexbin(dataframe)
plot(h, colramp = rf, xlab = "Point Differential", ylab = "Seed")
#this is for more color
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
#this is the actual setup of the plot
MM <- read.csv('data/MarchMadness.csv') %>%
mutate(Seeddiff = abs(Seed - Seed.1))
x <- MM$Seeddiff
y <- MM$Pointdiff
h<- hexbin(x, y, xbins = 20)
plot(h, colramp = rf,
xlab = "Seed Differential",
ylab = "Point Differential",
main = "Seed Differential vs Point Differential")
geom_hex()ggplot(MM, aes(x=Seeddiff, y=Pointdiff) ) +
geom_hex(bins = 30) +
scale_fill_continuous(type = "viridis") +
labs(x = "Seed Differential",
y = "Point Differential",
title = "Seed Differential vs Point Differential") +
theme_bw()
fit <- lm(Pointdiff ~ Seeddiff, data = MM)
r2 <- round(summary(fit)$r.squared, 2)
slope <- round(summary(fit)$coefficients["Seeddiff","Estimate"], 2)
ggplot(MM, aes(x=Seeddiff, y=Pointdiff)) +
geom_point(size = 3,
alpha = 0.20,
color = "steelblue") +
geom_smooth(method = lm,
se = FALSE,
linetype = "solid",
color = "red") +
labs(x = "Seed Differential",
y = "Point Differential",
title = "Higher Seed Differentials Are Associated with Higher Point Differentials") +
annotate("text", x = 1, y = 52,
label = as.expression(substitute(atop(R^2==r2, beta==slope),
list(r2=r2, slope=slope))),
vjust = "middle", hjust = "left") +
theme_bw()
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'
MM %>%
group_by(Seeddiff) %>%
summarize(AvgPtDiff = round(mean(Pointdiff), 2)) %>%
kbl(align = "l") %>%
kable_styling()
| Seeddiff | AvgPtDiff |
|---|---|
| 0 | 8.69 |
| 1 | 9.22 |
| 2 | 11.27 |
| 3 | 9.43 |
| 4 | 9.52 |
| 5 | 10.20 |
| 6 | 7.89 |
| 7 | 9.97 |
| 8 | 12.01 |
| 9 | 11.16 |
| 10 | 11.38 |
| 11 | 13.10 |
| 12 | 13.20 |
| 13 | 17.07 |
| 15 | 24.90 |
Hmm… It seems like there isn’t much difference in average point differential for seed differentials ranging from 2 to 10. Let’s rerun the scatterplot and regression for that range of seed differentials.
MM_sub <- MM %>%
filter(between(Seeddiff, 2, 10))
fit <- lm(Pointdiff ~ Seeddiff, data = MM_sub)
r2 <- round(summary(fit)$r.squared, 2)
slope <- round(summary(fit)$coefficients["Seeddiff","Estimate"], 2)
ggplot(MM_sub, aes(x=Seeddiff, y=Pointdiff)) +
geom_point(size = 3,
alpha = 0.20,
color = "steelblue") +
geom_smooth(method = lm,
se = FALSE,
linetype = "solid",
color = "red") +
labs(x = "Seed Differential",
y = "Point Differential",
title = "Moderate Seed Differentials Don't Seem Very Important") +
annotate("text", x = 2, y = 45,
label = as.expression(substitute(atop(R^2==r2, beta==slope),
list(r2=r2, slope=slope))),
vjust = "middle", hjust = "left") +
scale_x_continuous(breaks = 2:10, limits = c(2, 10)) +
theme_bw()
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'
# get a dataset from R
# Orange dataset has information about the growth of orange trees
orange <- Orange
# create a density plot
plot(density(orange$circumference),
main = "CIRCUMFERENCE OF ORANGE TREES",
xlab = "CIRCUMFERENCE",
ylab = "DISTRIBUTION")
polygon(density(orange$circumference), col=7, border=2)
# create the density plot
orange %>%
ggplot( aes(x = circumference)) +
geom_density(fill= 7, color= 2)
### Adding a Density Plot to a Histogram Use the same histogram created in the “Customized Histogram Using Base R” section and overlay a density curve on it.
x <- iris$Petal.Width
h <- hist(x, breaks=20, col="pink", freq=FALSE, xlab= "Petal Width", main="Histogram With Density Curve")
lines(density(x), lwd=2, col="blue")
###Base R
### Create a data set to work with
survey <- c(apple=45, orange=25, honeydew=35, banana=50, watermelon=20, blueberry=15)
### Create a barchart with titles and color
barplot(survey,
main="Favorite Fruit Survey",
xlab="Fruit",
ylab="Count",
col=c("red", "orange", "green", "yellow", "pink", "blue"))
### Add a legend
legend("topright",
legend = c("apple", "orange", "honeydew", "banana", "watermelon", "blueberry"),
col = c("red", "orange", "green", "yellow", "pink", "blue"),
pch = 19,
bty = "n",
pt.cex = 2,
cex = 1.2)
# Use the same data set created in the base r example but make it into a data frame
survey1 <- data.frame(
fruit=c("apple", "orange", "honeydew", "banana", "watermelon", "blueberry"),
value=c(45, 25, 35, 50, 20, 15))
# Create the barchart
ggplot(survey1, aes(x=fruit, y=value)) + geom_bar(stat="identity")
# Add color with black outlines
ggplot(survey1, aes(x=fruit, y=value)) + geom_bar(stat="identity", fill=c("red", "orange", "green", "yellow", "pink", "blue"), color="black") + labs(x="Count", y="Fruit", title="Fruit Survey") + theme(legend.position = "none")
### Bar plot with data added above the bars (Base R) First, let’s read in some data on passengers who were aboard the Titanic.
titanic <- read.csv("data/titanic.csv")
Next, calculate survival percentage and class count for each passenger class, using the aggregate() function, and then merge those two results into a data frame.
# calculate survival percentage by class and count by class.
surv_pct <- aggregate(Survived ~ Pclass, data = titanic, FUN = mean)
class_ct <- aggregate(Survived ~ Pclass, data = titanic, FUN = length)
# Merge those two results into a single data frame, and specify
# column names
by_class_data <- merge(surv_pct, class_ct,
by = "Pclass")
colnames(by_class_data) <- c("Pclass", "Surv_Pct", "Count")
#str(by_class_data)
Create the barplot. Note that we increase the ylim parameter a bit to leave room for labels above the bars. We also set axes = FALSE so that the default y-axis is not plotted. Note also that we assign the barplot object to a variable. What gets assigned to the variable is the x positions of the midpoints of each of the bars on the plot. We will use those x positions later to position the labels above the bars, since the x position for a text label specifies the center of the label.
In the next step we create a custom y-axis with the axis() function. The at parameter specifies the placement of the tick marks on the axis (“axis ticks”) and the labels parameter specifies the tick labels. Use the paste() function to make percentage labels.
Finally, we create the labels showing the count of passengers in each class and place them right above the bars. The x position comes from the barplot object and the y position is just the height of the bars plus a little extra so that the text labels don’t overlap the bars.
plt <- barplot(by_class_data$Surv_Pct,
names.arg = paste("Class", by_class_data$Pclass),
col = 'steelblue',
main = "Survival Percentage by Passenger Class",
ylab = "Survival Percentage",
xlab = "Passenger Class",
ylim = c(0.0, 0.75),
axes = FALSE
)
# Add y axis
axis(2, at = seq(0.0, 0.7, by = 0.1),
labels = paste(seq(0, 70, by = 10), "%", sep=""),
las = 1,
cex.axis = 0.9)
# Add the text of the labels
text(x = plt,
y = by_class_data$Surv_Pct + 0.04,
labels = paste("n: ", by_class_data$Count, sep=""),
cex=1)
First, let’s read in some data on passengers who were aboard the Titanic. For the ggplot2 example we will use the tidyverse approach.
titanic <- read_csv("data/titanic.csv")
Next, calculate survival percentage and class count for each passenger class.
survival <- titanic %>%
group_by(Pclass) %>%
summarize(Count = n(),
Surv_Pct = mean(Survived))
Make the plot.
ggplot(survival, aes(x = Pclass, y = Surv_Pct)) +
geom_col(fill = "steelblue") +
geom_text(aes(label = paste("n: ", Count, sep="")), vjust = -0.5) +
xlab("Passenger Class") +
ylab("Survival Percentage") +
ggtitle("Survival Percentage by Passenger Class") +
scale_y_continuous(limits = c(0, 0.70),
breaks = seq(0, .7, by = 0.10),
labels = paste(seq(0, 70, by = 10), "%", sep="")) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5)) # Centers title
Data is total steel produced annually from the 5 largest steel producing countries
steel <- read.csv("data/steel.csv")
Make the plot. geom_area() is the function that produces the stacked area plot.
ggplot(steel, aes(x=year, y=Steel, fill=Country)) +
geom_area() + ggtitle("Annual Steel Produced, Top 5 Countries (Million Tons)") + ylab("Steel Produced (Million Tons)") + xlab("Year")
ggplot(steel, aes(fill=Country, y=Steel, x=year)) +
geom_bar(position="fill", stat="identity") + ggtitle("Share of Steel Produced Annually") + ylab("% of Steel Produced") + xlab("Year")
In the Boxplot section add two basic boxplot examples, one utilizing base R and one utilizing ggplot2. Note that even basic examples should have appopriate titles and axis labels.
Base R:
# Some data to work with:
data(ChickWeight)
#Boxplot:
boxplot(ChickWeight$weight ~ ChickWeight$Time,
main = "Chick Age versus Chick Weight",
xlab = "Chick Age in Days",
ylab = "Chick Weight in Grams")
ggplot2:
p <- ggplot(ChickWeight, aes(group=Time, y=weight)) +
geom_boxplot() +
labs(title="Chick Age versus Chick Weight",x="Chick Age in Days", y = " Chick Weight in Grams")+
theme_classic()
p
Let’s pull the NBA Players data
nba_players <- read_csv("data/nba_players.csv")
Pulling data from OKC, UTA, and CHI
team <- c(nba_players$TEAM[nba_players$TEAM %in% c("OKC", "UTA", "CHI")])
pts <- c(nba_players$PTS[nba_players$TEAM %in% c("OKC", "UTA", "CHI")])
data <- data.frame(team, pts)
with(data , vioplot(
pts[team=="OKC"] , pts[team=="UTA"], pts[team=="CHI"],
col= c(255,234,268) , names=c("OKC","UTA","CHI"),
xlab = "Teams",
ylab = "Points",
main = "NBA Points Per Team",
ylim = c(0,27),
border = c(sample(200:900, 3)),
rectCol = c(893)
))
### Basic Plot with ggplot2 ### I did add a boxplot on top of the violin plot to make it resemble the base r graph.
ggplot(data, aes(x= team, y = pts, fill = team)) +
geom_violin(trim = F, show.legend = T) +
geom_boxplot(width = 0.1)
Here we can use the alpha() function from the scales package to set an alpha (transparency) level for the color, ranging from 0 (totally transparent) to 1 (totally translucent). method - 'jitter' adds random noise in the y-dimension and the jitter() function applied to the variable being plotted adds random noise to the actual values of the variable. amount = 0 causes that random noise to be z/50 where z is the range of the variable.
stripchart(jitter(iris$Sepal.Width, amount = 0),
main = "Sepal Width Distribution",
xlab = "Sepal Width (cm)",
col= alpha("blue", 0.5),
pch = 19,
method= 'jitter')
With ggplot2’s geom_jitter() is a shortcut for geom_point(position = "jitter"). Its width and height parameters control the amount of random noise in the x and y dimensions, respectively. With ggplot2 we can set alpha directly without the use of any helper functions. However, with ggplot2 it is a bit tricky to make a strip chart for just one variable (without a second variable for grouping), as we need to set the y variable, set axis limits, and suppress the axis ticks and labels for the unused axis.
ggplot(data = iris) +
geom_jitter(aes(x=Sepal.Width, y=0),
width = 0.05,
height = 0.2,
color = "blue",
alpha = 0.5,
pch = 19,
cex = 2) +
scale_y_continuous(limits = c(-1, 1),breaks = NULL, labels = NULL) +
labs(title = "Sepal Width Distribution",
y = NULL,
x = "Sepal Width (cm)")
This is fairly easy to do with ggplot2, as geom_jitter() by default takes two variables as parameters. One can be the categorical group, iris species.
ggplot(data = iris) +
### Add Boxplot on top of the strip plot for iris species. - used https://www.youtube.com/watch?v=mbvU8fF4eGM&t=613s as a reference
geom_boxplot(mapping = aes(x = Sepal.Width, y = Species, fill = Species),
alpha = 0.25,
width = 0.5,
show.legend = FALSE) +
geom_jitter(aes(y=Species, x=Sepal.Width, color = Species),
width = 0.05,
height = 0.2,
pch = 19,
cex = 2) +
labs(title = "Sepal Width Distribution by Iris Species",
x = "Sepal Width (cm)",
y = NULL) +
theme_bw() +
theme(legend.position = "none")
Strip charts are particularly useful with small datasets. The NBA data has a small number of players per team, so let’s try to use strip charts to compare the distribution of scoring on NBA teams, similar to the comparisons in the violin plot section.
nba_players %>%
filter(TEAM %in% c("OKC", "UTA", "CHI")) %>%
ggplot(aes(x =PTS, y = TEAM)) +
geom_point() +
labs(title = "Scoring Distribution on Selected NBA Teams")
top_ten <- nba_players %>%
filter(TEAM %in% c("OKC", "UTA", "CHI", "WAS")) %>%
group_by(TEAM) %>%
arrange(desc(PTS), .by_group = TRUE) %>%
mutate(rn = row_number()) %>%
filter(rn <= 10)
top_one <- nba_players %>%
filter(TEAM %in% c("OKC", "UTA", "CHI", "WAS")) %>%
group_by(TEAM) %>%
filter(row_number(desc(PTS)) == 1)
ggplot(data = top_ten, aes(x =PTS, y = TEAM)) +
geom_point() +
scale_x_continuous(breaks = seq(5, 30, 5)) +
geom_text(aes(label = PLAYER),
data = top_one,
nudge_x = 0.25,
angle = 90,
alpha = 0.5,
vjust = "top",
hjust = "middle") +
labs(title = "Scoring Distribution of Top 10 Scorers") +
theme_bw() +
theme(panel.grid.major.y = element_blank()) +
geom_boxplot(alpha = 0,
linetype = 3)
This example shows a strip plot to compare chick weights with different types of feed.
#in ggplot2:
ggplot(data = chickwts) +
geom_jitter(aes(y=feed, x=weight, color = feed),
width = 0.05,
height = 0.2,
alpha = 0.5,
pch = 19,
cex = 2) +
labs(title = "Chick Weight by Feed Type",
x = "Chick Weight in Grams",
y = "Feed Type") +
theme_bw() +
theme(legend.position = "none")
A second example, this time using the Carbon Dioxide Uptake in Grass Plants dataset
ggplot(data = CO2) +
geom_jitter(aes(y=Treatment, x=uptake, color = Treatment),
width = 0.05,
height = 0.2,
alpha = 0.5,
pch = 19,
cex = 2) +
labs(title = "Carbon Dioxide Uptake in Grass Plants by Treatment Type",
x = "Carbon Dioxide Uptake",
y = "Treatment Type") +
theme_bw() +
theme(legend.position = "none")
A quantile-quantile (“QQ”) plot is used to compare an empirical distribution to a theoretical distribution by plotting the observed quantiles vs the theoretical quantiles. They are typically used to evaluate whether or not a variable is normally distributed.
The built-in rivers dataset gives the lengths (in miles) of 141 rivers in North America, as compiled by the US Geological Survey. Let’s see if the river lengths are normally distributed by creating a normal probability plot of the river lengths.
Here we create a basic QQ plot in base R and add the normal distribution QQ plot as a reference line for comparison.
qqnorm(rivers,
main = "Q-Q Plot of North American River Lengths")
qqline(rivers, col = "red")
Here we produce a similar plot using ggplot2. First we need to put the river lengths in a data frame.
rivers_df <- data.frame(length = rivers)
Now we make the plot. Note that the shape and size of the geom are revised to better match the base R plot.
ggplot(rivers_df, aes(sample = length)) +
stat_qq(shape = 1, cex = 2) +
stat_qq_line(color = "red") +
labs(title = "Q-Q Plot of North American River Lengths",
x = "Theoretical Quantiles",
y = "Sample Quantiles") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5)) # Centers title
First, let’s read in some data on passengers who were aboard the Titanic.
titanic <- read.csv("data/titanic.csv")
Next we will mosaic plot of Titanic ticket class vs embarkment location. First check to see if there is any missing data.
table(titanic$Pclass)
##
## 1 2 3
## 186 173 355
table(titanic$Embarked)
##
## C Q S
## 2 130 28 554
Notice that there are two cases for which blank. We will remove those cases. Also, since Pclass is an integer vector it would be best to change it to a factor, and then specify the levels as a character vector. (See ?factor)
mask <- titanic$Embarked %in% c("C", "Q", "S")
embarked <- titanic$Embarked[mask]
class <- titanic$Pclass[mask]
class <- factor(class, levels = c(1, 2, 3),
labels = c("First", "Second", "Third"))
Make the mosaic plot.
mosaicplot(table(class, embarked),
main="Passenger Class by Embarkment Site",
xlab="Class",ylab="Embarkment Site",
col=c(2,3,4))
Let’s make a similar mosaic plot using ggplot2. We need to use the ggmosaic package to do so.
titanic_clean <- titanic %>%
filter(Embarked %in% c("C", "Q", "S")) %>% # Removes cases with missing data
mutate(
Embarked = factor(embarked),
Pclass = factor(Pclass)
)
titanic_m_data <- titanic_clean %>%
count(Embarked, Pclass)
ggplot(data = titanic_m_data) +
geom_mosaic(aes(x = product(Pclass), fill = Embarked, weight = n)) +
xlab("Passenger Class") +
ylab("Embarkment Site") +
ggtitle("Passenger Class by Embarkment Site")
Creating a scatterplot in base R is simple. For this example, I used the dataset pennsylvania-history, which shows the history of pennsylvania’s recorded covid cases from January to March 7. The variables of interest are positive, the number of positive cases, and deaths. Did the death count rise as cases grew?
First, we read the data file, pennsylvania-history.csv and store it as covidchart. Our x values in the plot function are covidchart\(positive, and our yvalue is covidchart\)death. Our title, as explaieid in main is “Covid Infection and Fatalities.” Our x-axis is labeled Infections, for covid infections, and our y axis is labeled fatalities. Note that these fatalities are not explciitly confirmed to be from Covid, but are the overall fatality count in PA over time, each point representing a different count of infection and fatality per day. We then draw a regression line based on positive cases and fatalities. The regression line is colored red and the points are colored grey after the color scheme of the virus itself.
covidchart <- read.csv("data/pennsylvania-history.csv")
covidchart <- subset(covidchart, !is.na(positive) & !is.na(death))
plot(covidchart$positive, covidchart$death, main = "Covid Infection and Fatalities", xlab = "Infections", ylab = "Fatalities", col = "grey")
abline(lm(covidchart$death ~ covidchart$positive), col = "red")
We can see that, yes, positive cases unfortunately have a positive correlation with fatalities.
Another way to color your plot is by using rcolorbrewer. We first have to load rcolorbrewer. Note that we will be using an example barplot instead of a scatterplot.
dummy <- data.frame(colno = c("col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8", "col9"), colcount = c(1, 2, 3, 4, 5, 6, 7, 8, 9))
Then, execute this to see all the available palettes.
display.brewer.all()
To view a single palette, type
display.brewer.pal(n, name)
name denotes the palette you want to display, while n denotes how many colors are in that palette. For example, the following code shows you the first four colors in the Pastel1 palette.
display.brewer.pal(4, "Pastel1")
brewer.pal(n, name)
returns the hex codes of the n colors in the named palette as a vector. Here we create a palette of all 9 colors in Pastel1 palette.
Past <- brewer.pal(9, "Pastel1")
From there, go back to your plot’s color argument and use the following code to set the colors of the bars to the first color in the “Past” palette we made with RColorBrewer in the code chunk above.
barplot(height = dummy$colcount, names = dummy$colno, xlab = "colors", ylab = "count", ylim = c(0,9), col = Past[1])
Alternatively, you could use the following code use every color in the palette to color the bars.
barplot(height = dummy$colcount, names = dummy$colno, xlab = "colors", ylab = "count", ylim = c(0,9), col = Past)
ggplot2 is much the same. we set covidchart as the source of our data and positive and death as our x and y values in aes(). geom_point() is called to make ggplot graph this as a scatterplot. The labs() function labels our x and y axes, and the graph overall. To add a regression line, we use geom_smooth(method = lm).
ggplot(data = covidchart, aes(positive, death)) + geom_point(colour = "red") + labs(x = "Infections", y = "Fatalities", title = "Covid Infection and Fatalities") + geom_smooth(method = lm)
In order to use an RColorBrewer palette in a barplot in ggplot2, you have to first save your barplot in a variable, and then append the function scale_fill_brewer() to it, with the argument palette =“palettename” inside the function. Don’t forget to add code setting “fill” equal to your x variables (i.e. your bars), otherwise you’ll get a plain grey bar graph. Done correctly, code should look like this:
ggplot(data = dummy, aes(x = colno, y = colcount, fill = colno)) +
geom_bar(stat = "identity") +
labs(x = "ColorNumber", y = "ColorCount", title = "Color Dummy Data") +
scale_fill_brewer(palette = "Pastel1")
Base R plots require a bit finessing to get just right.
First of all, I plan to place the legend outside the plot itself so I must first change the margins. This is done with par() which can set graphical parameters. Within par(), mar takes a vector input of 4 numbers determining how far into the pane the plot margins should be, starting at the bottom and continuing clockwise. xpd takes a logical value & setting it to TRUE tells R not to clip the legend or any other insertion outside the plot bounds. inset=c(-0.35,0) tells R to move the legend away from the plot, outside the margin horizontally. To make the legend look a bit cleaner, use bty='n' to remove the box around the legend.
To set the point size in base plot, use cex=data$variable. In this case cex=mtcars$disp. For the legend, use pt.cex=c(n1, n2, n3...nx) for however many legend items you have. I’m choosing intervals of 100 from 100 to 400.
par(mar=c(4, 5, 2, 8), xpd = T) # bottom, left, top, right
plot(mtcars$mpg, mtcars$hp, cex = mtcars$disp,
pch=16,
main = 'Miles per Gallon vs Gross Horsepower with Displacement',
xlab = 'miles per gallon',
ylab = 'gross horspower')
legend('right', inset=c(-0.35,0),
legend=c('100', '200', '300', '400'),
title = 'Displacement\n(cu. in.)',
pch=16, ncol=1, pt.cex = c(100, 200, 300, 400),
bty = 'n')
Yikes, it seems like the points may be a bit too large. Lets make them all smaller by dividing their values by 100 & take care not to forget about the legend points as well by changing the values for pt.cex.
par(mar=c(4, 5, 2, 8), xpd = T) # bottom, left, top, right
plot(mtcars$mpg, mtcars$hp, cex = mtcars$disp/100,
pch=16,
main = 'Miles per Gallon vs Gross Horsepower with Displacement',
xlab = 'miles per gallon',
ylab = 'gross horspower')
legend('right', inset=c(-0.35,0),
legend=c('100', '200', '300', '400'),
title = 'Displacement\n(cu. in.)',
pch=16, ncol=1, pt.cex = c(100, 200, 300, 400)/100,
bty = 'n')
Better, but the points don’t seem to be proportional. It seems that cex scales points by altering the radius of a point. This would cause the area of each consecutive point to be exponentially larger than the last when using the formula \(A=πr^2\). To correct this and make the point size proportional to the area, rearrange the formula to solve for the radius that produces the desired area. \(r=\sqrt{\frac{A}{π}}\)
Now the points are proportional in area but still too large, so I divide the areas by 5.
par(mar=c(4, 5, 2, 8), xpd = T) # bottom, left, top, right
plot(mtcars$mpg, mtcars$hp, cex = sqrt(mtcars$disp/3.14)/5,
pch=16,
main = 'Miles per Gallon vs Gross Horsepower with Displacement',
xlab = 'miles per gallon',
ylab = 'gross horspower')
legend('right', inset=c(-0.35,0),
legend=c('100', '200', '300', '400'),
title = 'Displacement\n(cu. in.)',
pch=16, ncol=1, pt.cex = sqrt(c(100, 200, 300, 400)/3.14)/5,
bty = 'n')
Now it looks right!
When I need to make a plot, I usually prefer ggplot() over base r for its simplicity. As you can see, I was able to make a clean graph with just 5 lines of code. To show displacement as a variable point size, you set size=disp within the aesthetics of ggplot. Ggplot automatically scales by area & makes the points reasonable sizes as well. Because mtcars is set as the data source, there is no need to reference the data frame for each variable.
ggplot(mtcars, aes(x=mpg, y=hp, size=disp))+
geom_point()+
ggtitle('Miles per Gallon vs Gross Horsepower with Displacement')+
labs(size = 'Displacement\n(cu. in.)', x='miles per gallon', y='gross horsepower')+
theme_classic()
data(swiss)
#Data Preparation
swiss$`municipality` <- rownames(swiss) # create new column for municipality names
swiss$Fertility_z <- round((swiss$Fertility - mean(swiss$Fertility))/sd(swiss$Fertility), 2) # compute normalized fertility
swiss$Fertility_type <- ifelse(swiss$Fertility_z < 0, "below", "above") # above / below avg flag
swiss <- swiss[order(swiss$Fertility_z), ] # sort
swiss$`municipality` <- factor(swiss$`municipality`, levels = swiss$`municipality`) # convert to factor to retain sorted order in plot.
#Diverging Barplot:
ggplot(swiss, aes(x=`municipality`, y=Fertility_z, label=Fertility_z)) +
geom_bar(stat='identity', aes(fill=Fertility_type), width=.5) +
theme(text = element_text(size=9)) +
scale_fill_manual(name="Fertility",
labels = c("Above Average", "Below Average"),
values = c("above"="#00ba38", "below"="#f8766d")) +
labs(title= "Divergence in the Average Fertility in Swiss Municipalities",
x = "Municipality",
y = 'Fertility') +
coord_flip()
To make a lollipop chart with ggplot2 you can use geom_point() for the circle and geom_segment() for the stem.
let <- data.frame(
x=LETTERS[1:10],
y=runif(10, 2, 9)
)
ggplot(let, aes(x=x, y=y)) +
geom_segment( aes(x=x, xend=x, y=0, yend=y), color="black") +
geom_point( size=4, color="blue",
fill=alpha("purple", 0.2), alpha=.8, shape=21, stroke=4) +
theme_gray() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank()
) +
xlab("Student") +
ylab("Hours Spent Studying") +
ggtitle("Student Hours of Study (per Week)") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(limits = c(0, 10))
Use the state data that comes with R. Let’s look at state life expectancy. First, prepare the data that we will use for a couple examples.
state_df <- data.frame(state.x77, Region=state.region, Division=state.division)
state_df <- rownames_to_column(state_df, var="State")
state_df$Life.Exp.Z <-
(state_df$Life.Exp - mean(state_df$Life.Exp))/sd(state_df$Life.Exp)
state_df$Above.Avg <- ifelse(state_df$Life.Exp.Z >= 0, "yes", "no")
state_df <- state_df[order(state_df$Life.Exp.Z), ]
state_df$State <- factor(state_df$State, levels = state_df$State)
Next create a simple diverging lollipop chart to highlight the difference of each state’s mean life expectancy from the mean state mean life expectancy.
ggplot(state_df, aes(x=State, y=Life.Exp)) +
geom_point(stat='identity', fill="black", size=2) +
geom_segment(aes(y = mean(Life.Exp),
x = State,
yend = Life.Exp,
xend = State),
color = "black") +
labs(title="State Life Expectancy",
subtitle="Deviation from Mean State Life Expectancy",
y = "Life Expectancy (Years)",
x = NULL) +
scale_y_continuous(breaks = seq(68, 74, 1)) +
coord_flip()
Now let’s try to view a subset. With a smaller amount of data we can add the life expectancies as labels on the dots.
ggplot(filter(state_df, Region == "South"), aes(x=State, y=Life.Exp.Z, label=round(Life.Exp, 1))) +
geom_hline(yintercept = 0, color = "darkgray") +
geom_point(stat='identity', aes(col=Above.Avg), size=6) +
geom_text(color="white", size=2.2) +
labs(y = "Standard Deviations from Mean State Life Expectancy") +
geom_text(aes(x = "West Virginia",
y = -0.05,
label = "Mean State Life Expectancy"),
angle = 90,
color = "darkgray") +
labs(title="Life Expectancy in the Southern States",
subtitle="Compared to Average State Life Expectancy") +
coord_flip() +
theme_bw() +
theme(axis.title.y=element_blank()) +
theme(legend.position = "none")
data <- read.csv("data/titanic.csv", TRUE, sep = ",")
slices <- table(data$Pclass)
## ^^ we use the table() function to get the count of each class
lbls <- c("1st Class", "2nd Class", "3rd Class")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) #this adds the percentage to the labels
lbls <- paste(lbls,"%",sep="")
pie(slices, labels = lbls, main = "Passenger Class Distribution", col = rainbow(length(lbls)))
#nums <- read.csv("data/titanic.csv")
#ClassCount <- as.data.frame(table(nums$Pclass))
#I tried to get the data from a csv but I was genuinely unable to
ClassCount <- data.frame(
Class = c("1st", "2nd", "3rd"),
n = c(186, 173, 355),
prop = c(26.1, 24.2, 49.7)
)
ClassCount <- ClassCount %>%
arrange(desc(Class)) %>%
mutate(lab.ypos = cumsum(prop) -0.5*prop)
## the cumsum(prop) - 0.5*prop helps us put the labels in the center of the portion
mycols <- rainbow(length(ClassCount$Class))
ggplot(ClassCount, aes(x = "", y = prop, fill = Class)) +
geom_bar(width = 1, stat = "identity", color = "black") +
coord_polar("y", start = 0) +
geom_text(aes(y = lab.ypos, label = prop), color = "black") +
scale_fill_manual(values = mycols) + theme_void()
Create a chloropleth using color to show population density of European countries. First, download geojson data using the geojson_read() function from the geojsonio package. what = "sp" instructs the function to return the data as a SpatialPolygonsDataFrame type. This type contains vertices data that can be used to draw polygons, as well as additional information associated with the polygon. In this case the polygons are European countries and the associated data includes population and area data from 2005.
spdf <- geojson_read("https://raw.githubusercontent.com/leakyMirror/map-of-europe/master/GeoJSON/europe.geojson", what = "sp")
class(spdf)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
The SpatialPolygonsDataFrame is implemented as an S4 class in R, so we can extract the data attribute (data associated with the polygons) with @. The data attribute is a data frame.
spdf_data <- spdf@data %>%
mutate(id = row_number())
To use the geospatial data with ggplot we first need to transform it into a data frame using the tidy() function of the broom package. The strtoi() function here is converting the id field in the resulting dataframe to an integer, so that it is the same type as the id field in spdf_data.
spdf_tibble <- broom::tidy(spdf) %>%
mutate(id = strtoi(id))
Merge the geospatial data with the data about the polygons (countries).
# Make the merge
spdf_tibble = spdf_tibble %>%
left_join(. , spdf_data, by="id")
Plot the countries, with color determined by the country’s population.
ggplot() +
geom_polygon(data = spdf_tibble,
aes(fill = POP2005,
x = long,
y = lat,
group = group), size=0, alpha=0.9) +
theme_void() +
coord_map()
Do the same, but use the color palette from the
viridis package.
ggplot() +
geom_polygon(data = spdf_tibble,
aes(fill = POP2005,
x = long,
y = lat,
group = group )) +
scale_fill_viridis() +
labs(title = "Population") +
theme_void() +
coord_map()